{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "   \n",
    "    'crawlOptions': {\n",
    "        'limit': 100\n",
    "    }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "    # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "    cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "    # Save the modified results as a text file containing JSON data\n",
    "    with open('crawl_result.txt', 'w') as file:\n",
    "        file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "    print(\"No data returned from crawl.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the video using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "    print('Waiting for file to be processed.')\n",
    "    time.sleep(2)\n",
    "    text_file = genai.get_file(text_file.name)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 5 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-flash-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:**  Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:**  Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
